In both cases, we consider a quality threshold of 60, i.e. a quality of 20 with 3 repeats, or 30 with two repeats. This means that we expect to have fewer remaining bases when only 2 repeats are used, because many will not pass the quality threshold. The analyses have been performed using Acevedo's scripts, modified in the case with 2 repeats. When 2 repeats are analysed, only the first and the third repeats are used. The third repeat should have lower quality than the second, so this is a conservative way to look at the data.
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import matplotlib
colNames = ["position","referenceBase","A","C", "G", "T"]
counts = pd.read_csv ("outputPassage2/Q20threshold.txt", sep="\t", header=None, names=colNames)
counts2Repeats = pd.read_csv ("output2RepeatsPassage2/Q30threshold.txt", sep="\t", header=None, names=colNames)
#Utilitary variables
axisFontSize = 20
titleFontSize = 25
axisTickFontSize = 12
matplotlib.rc('xtick', labelsize=axisTickFontSize)
matplotlib.rc('ytick', labelsize=axisTickFontSize)
rcParams["axes.labelsize"]=axisFontSize
In [3]:
def coverage(a,c,g,t):
cov = a+c+g+t
return cov
def majorBaseRatio(a,c,g,t,coverage):
l=[a,c,g,t]
l.sort()
return l[3]/coverage
def secondMajorBaseRatio(a,c,g,t,coverage):
l=[a,c,g,t]
l.sort()
return l[2]/coverage
def thirdMajorBaseRatio(a,c,g,t,coverage):
l=[a,c,g,t]
l.sort()
return l[1]/coverage
def fourthMajorBaseRatio(a,c,g,t,coverage):
l=[a,c,g,t]
l.sort()
return l[0]/coverage
def ARatio(a,coverage):
return a/coverage
def CRatio(c,coverage):
return c/coverage
def GRatio(g,coverage):
return g/coverage
def TRatio(t,coverage):
return t/coverage
def computeCoverageAndRatios (counts):
counts["coverage"]= list ( map(coverage, counts["A"], counts["C"], counts["G"], counts["T"]) )
counts["majorBaseRatio"] = list ( map(majorBaseRatio, counts["A"], counts["C"], counts["G"], counts["T"], counts["coverage"]) )
counts["secondMajorBaseRatio"] = list ( map(secondMajorBaseRatio, counts["A"], counts["C"], counts["G"], counts["T"], counts["coverage"]) )
counts["thirdMajorBaseRatio"] = list ( map(thirdMajorBaseRatio, counts["A"], counts["C"], counts["G"], counts["T"], counts["coverage"]) )
counts["fourthMajorBaseRatio"] = list ( map(fourthMajorBaseRatio, counts["A"], counts["C"], counts["G"], counts["T"], counts["coverage"]) )
counts["ARatio"] = list ( map(ARatio, counts["A"], counts["coverage"]) )
counts["CRatio"] = list ( map(CRatio, counts["C"], counts["coverage"]) )
counts["GRatio"] = list ( map(GRatio, counts["G"], counts["coverage"]) )
counts["TRatio"] = list ( map(TRatio, counts["T"], counts["coverage"]) )
return
computeCoverageAndRatios (counts)
counts.describe()
computeCoverageAndRatios (counts2Repeats)
counts2Repeats.describe()
Out[3]:
In [4]:
%matplotlib inline
def plotCoverage(subplot, counts, title):
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
subplot.set_ylabel ('Number of reads')
subplot.plot (counts ['position'], counts ['coverage'], 'b-') #(position, coverage, 'r-')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Sequencing coverage'], loc = 'upper left')
yaxis_lower_limit = counts ['coverage'].min()
yaxis_upper_limit = counts ['coverage'].max()
xaxis_lower_limit = counts ['position'].min()
xaxis_upper_limit = counts ['position'].max()
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])
f, axarr = plt.subplots(2, sharex=True,figsize=(15,15))
plotCoverage(axarr[0], counts,'Genome coverage by position, 3 repeats')
plotCoverage(axarr[1], counts2Repeats,'Genome coverage by position, 2 repeats')
plt.xlabel ('Genome position')
plt.tight_layout()
In [5]:
print("Average coverage for 3 repeats, quality threshold at 20: "+str(counts["coverage"].mean()))
print("Average coverage for 2 repeats, quality threshold at 30: "+str(counts2Repeats["coverage"].mean()))
In [6]:
%matplotlib inline
def plotFrequencyOfMostFrequentNucleotide (subplot, counts, title):
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
subplot.set_ylabel ('Frequency')
subplot.plot (counts ['position'], counts ['majorBaseRatio'], 'r.') #(position, coverage, 'r-')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Frequency of the most frequent nucleotide'], loc = 'upper right')
yaxis_lower_limit = counts ['majorBaseRatio'].min()
yaxis_upper_limit = counts ['majorBaseRatio'].max()
xaxis_lower_limit = counts ['position'].min()
xaxis_upper_limit = counts ['position'].max()
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])
f, axarr = plt.subplots(2, sharex=True,figsize=(15,15))
plotFrequencyOfMostFrequentNucleotide(axarr[0], counts, 'Frequency of the most frequent nucleotide by genome position, 3 repeats')
plotFrequencyOfMostFrequentNucleotide(axarr[1], counts2Repeats, 'Frequency of the most frequent nucleotide by genome position, 2 repeats')
plt.xlabel ('Genome position')
plt.tight_layout()
In [7]:
%matplotlib inline
def runningMeanFast(x, N):
return np.convolve(x, np.ones((N,))/N)[(N-1):]
def plotRMFrequencyOfMostFrequentNucleotide (subplot, counts, title):
majorBaseRatioMovingAverage10 = (runningMeanFast (counts ['majorBaseRatio'], 10))
secondMajorBaseRatioMovingAverage10 = (runningMeanFast (counts ['secondMajorBaseRatio'], 10))
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
subplot.plot (counts ['position'], majorBaseRatioMovingAverage10, 'r-') #(position, coverage, 'r-')
subplot.plot (counts ['position'], 1-secondMajorBaseRatioMovingAverage10, 'b-') #(position, coverage, 'r-')
subplot.set_ylabel ('Frequency')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Frequency of the most frequent nucleotide', 'Frequency of the second most frequent nucleotide'], loc = 'lower left')
yaxis_lower_limit = 0.995
yaxis_upper_limit = majorBaseRatioMovingAverage10.max()
xaxis_lower_limit = counts ['position'].min()
xaxis_upper_limit = counts ['position'].max()
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])
f, axarr = plt.subplots(2, sharex=True,figsize=(15,15))
plotRMFrequencyOfMostFrequentNucleotide(axarr[0], counts, 'Frequency of the most frequent nucleotide by genome position, 3 repeats')
plotRMFrequencyOfMostFrequentNucleotide(axarr[1], counts2Repeats, 'Frequency of the most frequent nucleotide by genome position, 2 repeats')
plt.xlabel ('Genome position')
plt.tight_layout()
In [8]:
%matplotlib inline
def plotFrequenciesOf3LeastFrequentNucleotides (subplot, counts, title):
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
subplot.plot (counts ['position'], counts ['secondMajorBaseRatio'], 'r.') #(position, coverage, 'r-')
subplot.plot (counts ['position'], counts ['thirdMajorBaseRatio'], 'b.') #(position, coverage, 'r-')
subplot.plot (counts ['position'], counts ['fourthMajorBaseRatio'], 'g.') #(position, coverage, 'r-')
subplot.set_ylabel ('Frequency')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Frequency of the second most frequent nucleotide', 'Frequency of the third most frequent nucleotide', 'Frequency of the fourth most frequent nucleotide'], loc = 'upper left')
yaxis_lower_limit = counts ['fourthMajorBaseRatio'].min()
yaxis_upper_limit = 1
xaxis_lower_limit = counts ['position'].min()
xaxis_upper_limit = counts ['position'].max()
subplot.set_yscale('log')
subplot.axhline(y = 0.01, linewidth=1, linestyle='dashed', color='0.75')
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])
f, axarr = plt.subplots(2, sharex=True,figsize=(15,15))
plotFrequenciesOf3LeastFrequentNucleotides(axarr[0], counts, 'Frequency of the 3 least frequent nucleotide by genome position, 3 repeats')
plotFrequenciesOf3LeastFrequentNucleotides(axarr[1], counts2Repeats, 'Frequency of the 3 least frequent nucleotide by genome position, 2 repeats')
plt.xlabel ('Genome position')
plt.tight_layout()
In [9]:
import scipy
import math
def KullbackLeibler (p, q):
if len(p) != len(q):
print("Error in KL: lists with different lengths")
exit(-1)
kl = 0
for i in range (len(p)):
ratio = 1
if p[i] == 0 and q[i] == 0:
ratio = 1
elif p[i] == 0:
ratio = 0.0000000001/q[i]
elif q[i] == 0:
ratio = p[i]/0.0000000001
else:
ratio = p[i]/q[i]
kl = kl + p[i]*math.log(ratio)
return kl
def JensenShannon (p, q):
meanpq = [sum(x)/2 for x in zip(p, q)]
js = KullbackLeibler (p, meanpq) + KullbackLeibler (q, meanpq)
return js
def computeDivergences(count1, count2):
ratios1 = list(zip(count1['ARatio'], count1['CRatio'], count1['GRatio'], count1['TRatio']))
ratios2 = list(zip(count2['ARatio'], count2['CRatio'], count2['GRatio'], count2['TRatio']))
result=list()
for i in range(len(ratios1)):
result.append(JensenShannon (ratios1[i], ratios2[i]))
return result
counts2Repeats['Divergence'] = computeDivergences(counts2Repeats, counts)
In [10]:
counts2Repeats.describe()
Out[10]:
In [11]:
%matplotlib inline
def scatterPlotDivergenceVsCoverage (subplot, counts, title):
rcParams['figure.figsize'] = 18, 8
subplot.suptitle(title, fontsize=15, fontweight='bold')
orderDivergenceByCoverage = [x for (y,x) in sorted(zip(counts ['coverage'],counts ['Divergence']))]
smoothedCurve = (runningMeanFast (orderDivergenceByCoverage, 200))
subplot.plot (counts ['coverage'], counts ['Divergence'], 'r.', alpha=0.3)
subplot.plot (sorted(counts ['coverage']), smoothedCurve, '-', alpha=1, linewidth=2)
subplot.ylabel ('Divergence')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Divergence'], loc = 'upper left')
yaxis_lower_limit = counts ['Divergence'].min()
yaxis_upper_limit = counts ['Divergence'].max()
xaxis_lower_limit = counts ['coverage'].min()
xaxis_upper_limit = counts ['coverage'].max()
#subplot.yscale('log')
#subplot.axhline(y = 0.01, linewidth=1, linestyle='dashed', color='0.75')
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])
#f, axarr = plt.subplots(1, sharex=True,figsize=(15,15))
scatterPlotDivergenceVsCoverage(plt, counts2Repeats, 'Divergence in nucleotide frequencies between experiences against coverage')
plt.xlabel ('Coverage')
plt.tight_layout()
In [12]:
%matplotlib inline
def scatterPlotRatioDifferenceVsCoverage (subplot, ratio1, ratio2, title):
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
diffRatio = pd.Series([(x-y)/(x+y) for (x,y) in zip(ratio1,ratio2)])
yaxis_lower_limit = diffRatio.min(skipna=True)
yaxis_upper_limit = diffRatio.max(skipna=True)
xaxis_lower_limit = counts ['coverage'].min()
xaxis_upper_limit = counts ['coverage'].max()
#diffRatio = abs((ratio1 - ratio2)/(ratio1)
orderdiffRatioByCoverage = [x for (y,x) in sorted(zip(counts ['coverage'],diffRatio))]
smoothedCurve = (runningMeanFast (orderdiffRatioByCoverage, 50))
subplot.plot (counts ['coverage'], diffRatio, 'r.', alpha=0.3)
subplot.plot (sorted(counts ['coverage']), smoothedCurve, '-', alpha=1, linewidth=2)
subplot.set_ylabel ('Frequency difference')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Relative frequency difference'], loc = 'upper left')
#subplot.set_yscale('log')
subplot.locator_params(axis='y',nbins=6)
subplot.axhline(y = 0.00, linewidth=2, linestyle='dashed', color='0.1')
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])
f, axarr = plt.subplots(4, sharex=True,figsize=(15,15))
scatterPlotRatioDifferenceVsCoverage(axarr[0], counts['majorBaseRatio'], counts2Repeats['majorBaseRatio'], 'Relative difference in nucleotide frequencies between experiences against coverage, major allele')
scatterPlotRatioDifferenceVsCoverage(axarr[1], counts['secondMajorBaseRatio'], counts2Repeats['secondMajorBaseRatio'], 'Relative difference in nucleotide frequencies between experiences against coverage, second major allele')
scatterPlotRatioDifferenceVsCoverage(axarr[2], counts['thirdMajorBaseRatio'], counts2Repeats['thirdMajorBaseRatio'], 'Relative difference in nucleotide frequencies between experiences against coverage, third major allele')
scatterPlotRatioDifferenceVsCoverage(axarr[3], counts['fourthMajorBaseRatio'], counts2Repeats['fourthMajorBaseRatio'], 'Relative difference in nucleotide frequencies between experiences against coverage, fourth major allele')
plt.xlabel ('Coverage')
plt.tight_layout()
Therefore there is a trade-off: having more repeats increases the quality for all bases, but decreases the number of informative bases. We need to look further (e.g. into our real data) to investigate whether 2 repeats could be enough.
In [ ]: